In [171]:
import os
import requests
import numpy as np
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from statsmodels.stats.multicomp import pairwise_tukeyhsd

Derick Carlson Final Project¶

* Data Choice * Data Prep * Data Details * Exploratory Data Analysis * Addtional Data Analysis * Model of Crab Size Over Time

Data Choice¶

  • Source: Kaggle
  • Description: Crab age, sex, and size measurements
  • Why: Upon inspection, the Dataset appeared clean, large, and had interesting dimensions to explore without being overwhelming.

Data Prep¶

  • Download and Retrieval
  • Remove "0"s and na
  • New Calculated Variables

Download and Retrieval¶

In [130]:
# URL of file
url = "https://www.kaggle.com/datasets/shalfey/extended-crab-age-prediction/download?datasetVersionNumber=1"
In [131]:
# File path to save the downloaded file
file_path = "Crabs.csv"
In [132]:
def download_file(url, file_path):
    response = requests.get(url, stream=True)
    if response.status_code == 200:
        with open(file_path, 'wb') as file:
            for chunk in response.iter_content(chunk_size=1024):
                file.write(chunk)
        print("File downloaded successfully.")
    else:
        print("Failed to download file.")
In [133]:
def check_and_download_file(url, file_path):
    if not os.path.isfile(file_path):
        download_file(url, file_path)
    else:
        print("File already exists.")
In [134]:
# Check if the file exists and download it if necessary
check_and_download_file(url, file_path)
File already exists.
In [135]:
df = pd.read_csv(file_path)

Remove "0"s and na¶

In [136]:
df.replace(0, float('nan'), inplace=True)
df.dropna(inplace=True)
In [137]:
# Get the count of NA values in each column
na_counts = df.isna().sum()
null_counts = df.isnull().sum()
print("NA counts:")
print(na_counts)
print("Null counts:")
print(null_counts)
NA counts:
id                0
Sex               0
Length            0
Diameter          0
Height            0
Weight            0
Shucked Weight    0
Viscera Weight    0
Shell Weight      0
Age               0
dtype: int64
Null counts:
id                0
Sex               0
Length            0
Diameter          0
Height            0
Weight            0
Shucked Weight    0
Viscera Weight    0
Shell Weight      0
Age               0
dtype: int64

New Calculated Variable¶

Because many of the variables will be colinear, I want to produce two calculated variables called Volume and Density

In [138]:
# Volume
df['Volume'] = np.pi * (df['Diameter'] / 2)**2 * df['Height']
In [139]:
 # Density
df['Density'] = df['Weight'] / df['Volume']

Data Details¶

  • Rows and Column Counts
  • Info
  • Top Rows Sample

Rows and Column Counts¶

In [140]:
print(df.shape)
(199852, 12)

Info¶

In [141]:
# info
print(df.info())
<class 'pandas.core.frame.DataFrame'>
Int64Index: 199852 entries, 1 to 199999
Data columns (total 12 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   id              199852 non-null  float64
 1   Sex             199852 non-null  object 
 2   Length          199852 non-null  float64
 3   Diameter        199852 non-null  float64
 4   Height          199852 non-null  float64
 5   Weight          199852 non-null  float64
 6   Shucked Weight  199852 non-null  float64
 7   Viscera Weight  199852 non-null  float64
 8   Shell Weight    199852 non-null  float64
 9   Age             199852 non-null  float64
 10  Volume          199852 non-null  float64
 11  Density         199852 non-null  float64
dtypes: float64(11), object(1)
memory usage: 19.8+ MB
None

Top Rows Sample¶

In [142]:
df.head()
Out[142]:
id Sex Length Diameter Height Weight Shucked Weight Viscera Weight Shell Weight Age Volume Density
1 1.0 I 1.2375 1.0000 0.3750 21.885814 7.654365 3.798833 7.654365 19.0 0.294524 74.309024
2 2.0 F 1.4500 1.1625 0.4125 28.250277 11.127179 7.016501 7.257472 11.0 0.437824 64.524248
3 3.0 I 1.3500 1.0250 0.3750 21.588144 9.738053 4.110678 6.378637 9.0 0.309435 69.766419
4 4.0 I 1.1375 0.8750 0.2875 14.968536 5.953395 2.962523 3.713785 8.0 0.172880 86.583570
5 5.0 F 1.4875 1.1875 0.4000 28.335325 12.048538 7.668540 8.504850 10.0 0.443014 63.960388

Exploratory Data Analysis¶

  • Descriptive Statistics
  • Outliers
  • Differences in Categoric Variables
  • Correlation Matrix in Measurements

Descriptive Statistics¶

In [143]:
# Descriptive statistics
print(df.describe())
                  id         Length       Diameter         Height  \
count  199852.000000  199852.000000  199852.000000  199852.000000   
mean    99996.314633       1.313000       1.020788       0.346273   
std     57734.152658       0.289181       0.238135       0.090756   
min         1.000000       0.187500       0.112500       0.012500   
25%     49996.750000       1.150000       0.875000       0.287500   
50%     99999.500000       1.375000       1.075000       0.362500   
75%    149994.250000       1.525000       1.200000       0.412500   
max    199999.000000       7.583491       2.250000       2.825000   

              Weight  Shucked Weight  Viscera Weight   Shell Weight  \
count  199852.000000   199852.000000   199852.000000  199852.000000   
mean       23.138912        9.996106        4.996516       6.638642   
std        12.589390        5.600855        2.783996       3.554264   
min         0.028349        0.014175        0.014175       0.042524   
25%        13.168343        5.669900        2.806601       3.827183   
50%        23.530085        9.851451        4.890289       6.803880   
75%        32.077459       13.933779        6.959802       9.029316   
max        80.101512       45.274152       26.124064      37.038622   

                 Age         Volume        Density  
count  199852.000000  199852.000000  199852.000000  
mean        9.954897       0.328318      73.311126  
std         3.214314       0.186511      13.717132  
min         1.000000       0.000249       4.242654  
25%         8.000000       0.179841      66.508786  
50%        10.000000       0.330064      71.818174  
75%        11.000000       0.461725      77.949224  
max        29.000000       1.898418    1654.163642  

Outliers¶

In [144]:
threshold = 3

z_scores = (df['Density'] - df['Density'].mean()) / df['Density'].std()
        
# Identify the outliers using the threshold
outliers = df[abs(z_scores) > threshold]
        
# Remove the outliers from the DataFrame
df = df.drop(outliers.index)

# Reset the index of the DataFrame
df = df.reset_index(drop=True)

Categoric Variables¶

  • Sex: M/F
  • Age is both numeric and categoric.
In [145]:
# setup
variables = ['Length', 'Diameter', 'Height', 'Weight', 'Shucked Weight', 'Viscera Weight', 'Shell Weight', 'Volume', 'Density']
ages = np.sort(df['Age'].unique())

grouped = df.groupby(['Sex', 'Age'])
In [146]:
for age in ages:
    print(f"\nAge: {age}")
    print("-------------------------")
    group = df[df['Age'] == age]
    for var in variables:
        group1 = group[group['Sex'] == 'M'][var]
        group2 = group[group['Sex'] == 'F'][var]
        # Check if there are sufficient data points in each group
        if len(group1) < 2 or len(group2) < 2:
            print(f"For Age={age}, Variable={var}: Insufficient data")
            continue
        t_statistic, p_value = stats.ttest_ind(group1, group2)
        print(f"Age={age}, Variable={var}: t-statistic={t_statistic:.3f}, p-value={p_value:.3f}")
        if p_value < 0.05:
            print("Reject null")
Age: 1.0
-------------------------
For Age=1.0, Variable=Length: Insufficient data
For Age=1.0, Variable=Diameter: Insufficient data
For Age=1.0, Variable=Height: Insufficient data
For Age=1.0, Variable=Weight: Insufficient data
For Age=1.0, Variable=Shucked Weight: Insufficient data
For Age=1.0, Variable=Viscera Weight: Insufficient data
For Age=1.0, Variable=Shell Weight: Insufficient data
For Age=1.0, Variable=Volume: Insufficient data
For Age=1.0, Variable=Density: Insufficient data

Age: 2.0
-------------------------
For Age=2.0, Variable=Length: Insufficient data
For Age=2.0, Variable=Diameter: Insufficient data
For Age=2.0, Variable=Height: Insufficient data
For Age=2.0, Variable=Weight: Insufficient data
For Age=2.0, Variable=Shucked Weight: Insufficient data
For Age=2.0, Variable=Viscera Weight: Insufficient data
For Age=2.0, Variable=Shell Weight: Insufficient data
For Age=2.0, Variable=Volume: Insufficient data
For Age=2.0, Variable=Density: Insufficient data

Age: 3.0
-------------------------
For Age=3.0, Variable=Length: Insufficient data
For Age=3.0, Variable=Diameter: Insufficient data
For Age=3.0, Variable=Height: Insufficient data
For Age=3.0, Variable=Weight: Insufficient data
For Age=3.0, Variable=Shucked Weight: Insufficient data
For Age=3.0, Variable=Viscera Weight: Insufficient data
For Age=3.0, Variable=Shell Weight: Insufficient data
For Age=3.0, Variable=Volume: Insufficient data
For Age=3.0, Variable=Density: Insufficient data

Age: 4.0
-------------------------
Age=4.0, Variable=Length: t-statistic=0.022, p-value=0.983
Age=4.0, Variable=Diameter: t-statistic=0.440, p-value=0.661
Age=4.0, Variable=Height: t-statistic=0.949, p-value=0.346
Age=4.0, Variable=Weight: t-statistic=0.373, p-value=0.710
Age=4.0, Variable=Shucked Weight: t-statistic=0.514, p-value=0.609
Age=4.0, Variable=Viscera Weight: t-statistic=0.371, p-value=0.712
Age=4.0, Variable=Shell Weight: t-statistic=0.374, p-value=0.709
Age=4.0, Variable=Volume: t-statistic=0.631, p-value=0.530
Age=4.0, Variable=Density: t-statistic=-1.188, p-value=0.239

Age: 5.0
-------------------------
Age=5.0, Variable=Length: t-statistic=-2.918, p-value=0.004
Reject null
Age=5.0, Variable=Diameter: t-statistic=-2.689, p-value=0.008
Reject null
Age=5.0, Variable=Height: t-statistic=-1.951, p-value=0.052
Age=5.0, Variable=Weight: t-statistic=-2.999, p-value=0.003
Reject null
Age=5.0, Variable=Shucked Weight: t-statistic=-1.903, p-value=0.058
Age=5.0, Variable=Viscera Weight: t-statistic=-3.201, p-value=0.002
Reject null
Age=5.0, Variable=Shell Weight: t-statistic=-3.737, p-value=0.000
Reject null
Age=5.0, Variable=Volume: t-statistic=-2.459, p-value=0.015
Reject null
Age=5.0, Variable=Density: t-statistic=-1.446, p-value=0.149

Age: 6.0
-------------------------
Age=6.0, Variable=Length: t-statistic=-8.084, p-value=0.000
Reject null
Age=6.0, Variable=Diameter: t-statistic=-8.110, p-value=0.000
Reject null
Age=6.0, Variable=Height: t-statistic=-7.552, p-value=0.000
Reject null
Age=6.0, Variable=Weight: t-statistic=-7.307, p-value=0.000
Reject null
Age=6.0, Variable=Shucked Weight: t-statistic=-6.915, p-value=0.000
Reject null
Age=6.0, Variable=Viscera Weight: t-statistic=-7.676, p-value=0.000
Reject null
Age=6.0, Variable=Shell Weight: t-statistic=-7.328, p-value=0.000
Reject null
Age=6.0, Variable=Volume: t-statistic=-7.531, p-value=0.000
Reject null
Age=6.0, Variable=Density: t-statistic=1.024, p-value=0.306

Age: 7.0
-------------------------
Age=7.0, Variable=Length: t-statistic=-10.670, p-value=0.000
Reject null
Age=7.0, Variable=Diameter: t-statistic=-10.882, p-value=0.000
Reject null
Age=7.0, Variable=Height: t-statistic=-10.272, p-value=0.000
Reject null
Age=7.0, Variable=Weight: t-statistic=-9.683, p-value=0.000
Reject null
Age=7.0, Variable=Shucked Weight: t-statistic=-8.757, p-value=0.000
Reject null
Age=7.0, Variable=Viscera Weight: t-statistic=-10.465, p-value=0.000
Reject null
Age=7.0, Variable=Shell Weight: t-statistic=-9.789, p-value=0.000
Reject null
Age=7.0, Variable=Volume: t-statistic=-9.693, p-value=0.000
Reject null
Age=7.0, Variable=Density: t-statistic=2.349, p-value=0.019
Reject null

Age: 8.0
-------------------------
Age=8.0, Variable=Length: t-statistic=-8.738, p-value=0.000
Reject null
Age=8.0, Variable=Diameter: t-statistic=-9.103, p-value=0.000
Reject null
Age=8.0, Variable=Height: t-statistic=-9.177, p-value=0.000
Reject null
Age=8.0, Variable=Weight: t-statistic=-7.967, p-value=0.000
Reject null
Age=8.0, Variable=Shucked Weight: t-statistic=-6.716, p-value=0.000
Reject null
Age=8.0, Variable=Viscera Weight: t-statistic=-8.722, p-value=0.000
Reject null
Age=8.0, Variable=Shell Weight: t-statistic=-9.235, p-value=0.000
Reject null
Age=8.0, Variable=Volume: t-statistic=-8.935, p-value=0.000
Reject null
Age=8.0, Variable=Density: t-statistic=4.722, p-value=0.000
Reject null

Age: 9.0
-------------------------
Age=9.0, Variable=Length: t-statistic=-13.077, p-value=0.000
Reject null
Age=9.0, Variable=Diameter: t-statistic=-13.152, p-value=0.000
Reject null
Age=9.0, Variable=Height: t-statistic=-13.283, p-value=0.000
Reject null
Age=9.0, Variable=Weight: t-statistic=-12.271, p-value=0.000
Reject null
Age=9.0, Variable=Shucked Weight: t-statistic=-10.398, p-value=0.000
Reject null
Age=9.0, Variable=Viscera Weight: t-statistic=-13.269, p-value=0.000
Reject null
Age=9.0, Variable=Shell Weight: t-statistic=-13.800, p-value=0.000
Reject null
Age=9.0, Variable=Volume: t-statistic=-13.072, p-value=0.000
Reject null
Age=9.0, Variable=Density: t-statistic=4.224, p-value=0.000
Reject null

Age: 10.0
-------------------------
Age=10.0, Variable=Length: t-statistic=-8.306, p-value=0.000
Reject null
Age=10.0, Variable=Diameter: t-statistic=-8.575, p-value=0.000
Reject null
Age=10.0, Variable=Height: t-statistic=-10.274, p-value=0.000
Reject null
Age=10.0, Variable=Weight: t-statistic=-8.655, p-value=0.000
Reject null
Age=10.0, Variable=Shucked Weight: t-statistic=-5.889, p-value=0.000
Reject null
Age=10.0, Variable=Viscera Weight: t-statistic=-10.045, p-value=0.000
Reject null
Age=10.0, Variable=Shell Weight: t-statistic=-9.865, p-value=0.000
Reject null
Age=10.0, Variable=Volume: t-statistic=-10.343, p-value=0.000
Reject null
Age=10.0, Variable=Density: t-statistic=6.362, p-value=0.000
Reject null

Age: 11.0
-------------------------
Age=11.0, Variable=Length: t-statistic=-7.473, p-value=0.000
Reject null
Age=11.0, Variable=Diameter: t-statistic=-7.284, p-value=0.000
Reject null
Age=11.0, Variable=Height: t-statistic=-7.636, p-value=0.000
Reject null
Age=11.0, Variable=Weight: t-statistic=-7.349, p-value=0.000
Reject null
Age=11.0, Variable=Shucked Weight: t-statistic=-5.065, p-value=0.000
Reject null
Age=11.0, Variable=Viscera Weight: t-statistic=-8.440, p-value=0.000
Reject null
Age=11.0, Variable=Shell Weight: t-statistic=-8.430, p-value=0.000
Reject null
Age=11.0, Variable=Volume: t-statistic=-8.163, p-value=0.000
Reject null
Age=11.0, Variable=Density: t-statistic=2.824, p-value=0.005
Reject null

Age: 12.0
-------------------------
Age=12.0, Variable=Length: t-statistic=-0.905, p-value=0.365
Age=12.0, Variable=Diameter: t-statistic=-1.032, p-value=0.302
Age=12.0, Variable=Height: t-statistic=-1.565, p-value=0.118
Age=12.0, Variable=Weight: t-statistic=-0.808, p-value=0.419
Age=12.0, Variable=Shucked Weight: t-statistic=0.672, p-value=0.501
Age=12.0, Variable=Viscera Weight: t-statistic=-0.803, p-value=0.422
Age=12.0, Variable=Shell Weight: t-statistic=-1.882, p-value=0.060
Age=12.0, Variable=Volume: t-statistic=-1.322, p-value=0.186
Age=12.0, Variable=Density: t-statistic=1.347, p-value=0.178

Age: 13.0
-------------------------
Age=13.0, Variable=Length: t-statistic=0.930, p-value=0.352
Age=13.0, Variable=Diameter: t-statistic=0.697, p-value=0.486
Age=13.0, Variable=Height: t-statistic=-0.753, p-value=0.452
Age=13.0, Variable=Weight: t-statistic=0.904, p-value=0.366
Age=13.0, Variable=Shucked Weight: t-statistic=2.153, p-value=0.031
Reject null
Age=13.0, Variable=Viscera Weight: t-statistic=-1.045, p-value=0.296
Age=13.0, Variable=Shell Weight: t-statistic=0.445, p-value=0.656
Age=13.0, Variable=Volume: t-statistic=-0.188, p-value=0.851
Age=13.0, Variable=Density: t-statistic=2.691, p-value=0.007
Reject null

Age: 14.0
-------------------------
Age=14.0, Variable=Length: t-statistic=-3.521, p-value=0.000
Reject null
Age=14.0, Variable=Diameter: t-statistic=-3.379, p-value=0.001
Reject null
Age=14.0, Variable=Height: t-statistic=-3.243, p-value=0.001
Reject null
Age=14.0, Variable=Weight: t-statistic=-2.913, p-value=0.004
Reject null
Age=14.0, Variable=Shucked Weight: t-statistic=-1.508, p-value=0.132
Age=14.0, Variable=Viscera Weight: t-statistic=-4.498, p-value=0.000
Reject null
Age=14.0, Variable=Shell Weight: t-statistic=-3.345, p-value=0.001
Reject null
Age=14.0, Variable=Volume: t-statistic=-3.674, p-value=0.000
Reject null
Age=14.0, Variable=Density: t-statistic=2.355, p-value=0.019
Reject null

Age: 15.0
-------------------------
Age=15.0, Variable=Length: t-statistic=0.779, p-value=0.436
Age=15.0, Variable=Diameter: t-statistic=1.064, p-value=0.288
Age=15.0, Variable=Height: t-statistic=0.800, p-value=0.423
Age=15.0, Variable=Weight: t-statistic=1.493, p-value=0.136
Age=15.0, Variable=Shucked Weight: t-statistic=1.995, p-value=0.046
Reject null
Age=15.0, Variable=Viscera Weight: t-statistic=0.320, p-value=0.749
Age=15.0, Variable=Shell Weight: t-statistic=1.132, p-value=0.258
Age=15.0, Variable=Volume: t-statistic=0.811, p-value=0.418
Age=15.0, Variable=Density: t-statistic=1.631, p-value=0.103

Age: 16.0
-------------------------
Age=16.0, Variable=Length: t-statistic=-2.408, p-value=0.016
Reject null
Age=16.0, Variable=Diameter: t-statistic=-2.184, p-value=0.029
Reject null
Age=16.0, Variable=Height: t-statistic=-2.972, p-value=0.003
Reject null
Age=16.0, Variable=Weight: t-statistic=-2.263, p-value=0.024
Reject null
Age=16.0, Variable=Shucked Weight: t-statistic=-1.673, p-value=0.094
Age=16.0, Variable=Viscera Weight: t-statistic=-2.712, p-value=0.007
Reject null
Age=16.0, Variable=Shell Weight: t-statistic=-1.955, p-value=0.051
Age=16.0, Variable=Volume: t-statistic=-2.759, p-value=0.006
Reject null
Age=16.0, Variable=Density: t-statistic=2.423, p-value=0.015
Reject null

Age: 17.0
-------------------------
Age=17.0, Variable=Length: t-statistic=-3.403, p-value=0.001
Reject null
Age=17.0, Variable=Diameter: t-statistic=-3.311, p-value=0.001
Reject null
Age=17.0, Variable=Height: t-statistic=-2.733, p-value=0.006
Reject null
Age=17.0, Variable=Weight: t-statistic=-2.940, p-value=0.003
Reject null
Age=17.0, Variable=Shucked Weight: t-statistic=-2.365, p-value=0.018
Reject null
Age=17.0, Variable=Viscera Weight: t-statistic=-2.782, p-value=0.005
Reject null
Age=17.0, Variable=Shell Weight: t-statistic=-2.794, p-value=0.005
Reject null
Age=17.0, Variable=Volume: t-statistic=-3.327, p-value=0.001
Reject null
Age=17.0, Variable=Density: t-statistic=0.996, p-value=0.319

Age: 18.0
-------------------------
Age=18.0, Variable=Length: t-statistic=-2.309, p-value=0.021
Reject null
Age=18.0, Variable=Diameter: t-statistic=-2.286, p-value=0.022
Reject null
Age=18.0, Variable=Height: t-statistic=-1.769, p-value=0.077
Age=18.0, Variable=Weight: t-statistic=-1.861, p-value=0.063
Age=18.0, Variable=Shucked Weight: t-statistic=-0.241, p-value=0.810
Age=18.0, Variable=Viscera Weight: t-statistic=-2.109, p-value=0.035
Reject null
Age=18.0, Variable=Shell Weight: t-statistic=-2.654, p-value=0.008
Reject null
Age=18.0, Variable=Volume: t-statistic=-2.062, p-value=0.039
Reject null
Age=18.0, Variable=Density: t-statistic=0.321, p-value=0.749

Age: 19.0
-------------------------
Age=19.0, Variable=Length: t-statistic=-1.122, p-value=0.262
Age=19.0, Variable=Diameter: t-statistic=-1.342, p-value=0.180
Age=19.0, Variable=Height: t-statistic=-1.082, p-value=0.280
Age=19.0, Variable=Weight: t-statistic=-1.062, p-value=0.288
Age=19.0, Variable=Shucked Weight: t-statistic=-0.328, p-value=0.743
Age=19.0, Variable=Viscera Weight: t-statistic=-1.273, p-value=0.203
Age=19.0, Variable=Shell Weight: t-statistic=-1.101, p-value=0.271
Age=19.0, Variable=Volume: t-statistic=-1.326, p-value=0.185
Age=19.0, Variable=Density: t-statistic=0.738, p-value=0.461

Age: 20.0
-------------------------
Age=20.0, Variable=Length: t-statistic=-1.611, p-value=0.108
Age=20.0, Variable=Diameter: t-statistic=-1.846, p-value=0.065
Age=20.0, Variable=Height: t-statistic=-1.007, p-value=0.314
Age=20.0, Variable=Weight: t-statistic=-1.079, p-value=0.281
Age=20.0, Variable=Shucked Weight: t-statistic=-0.272, p-value=0.786
Age=20.0, Variable=Viscera Weight: t-statistic=-0.581, p-value=0.561
Age=20.0, Variable=Shell Weight: t-statistic=-1.329, p-value=0.184
Age=20.0, Variable=Volume: t-statistic=-1.490, p-value=0.136
Age=20.0, Variable=Density: t-statistic=1.201, p-value=0.230

Age: 21.0
-------------------------
Age=21.0, Variable=Length: t-statistic=-1.115, p-value=0.265
Age=21.0, Variable=Diameter: t-statistic=-1.146, p-value=0.252
Age=21.0, Variable=Height: t-statistic=-0.273, p-value=0.785
Age=21.0, Variable=Weight: t-statistic=-0.801, p-value=0.423
Age=21.0, Variable=Shucked Weight: t-statistic=-1.085, p-value=0.278
Age=21.0, Variable=Viscera Weight: t-statistic=-1.410, p-value=0.159
Age=21.0, Variable=Shell Weight: t-statistic=-0.282, p-value=0.778
Age=21.0, Variable=Volume: t-statistic=-0.729, p-value=0.466
Age=21.0, Variable=Density: t-statistic=-0.324, p-value=0.746

Age: 22.0
-------------------------
Age=22.0, Variable=Length: t-statistic=-0.310, p-value=0.757
Age=22.0, Variable=Diameter: t-statistic=-0.093, p-value=0.926
Age=22.0, Variable=Height: t-statistic=-0.112, p-value=0.911
Age=22.0, Variable=Weight: t-statistic=-0.128, p-value=0.898
Age=22.0, Variable=Shucked Weight: t-statistic=0.579, p-value=0.563
Age=22.0, Variable=Viscera Weight: t-statistic=-0.131, p-value=0.896
Age=22.0, Variable=Shell Weight: t-statistic=0.263, p-value=0.793
Age=22.0, Variable=Volume: t-statistic=0.006, p-value=0.995
Age=22.0, Variable=Density: t-statistic=0.001, p-value=0.999

Age: 23.0
-------------------------
Age=23.0, Variable=Length: t-statistic=-0.346, p-value=0.730
Age=23.0, Variable=Diameter: t-statistic=-0.030, p-value=0.976
Age=23.0, Variable=Height: t-statistic=-0.674, p-value=0.501
Age=23.0, Variable=Weight: t-statistic=-0.473, p-value=0.637
Age=23.0, Variable=Shucked Weight: t-statistic=0.194, p-value=0.846
Age=23.0, Variable=Viscera Weight: t-statistic=-0.998, p-value=0.319
Age=23.0, Variable=Shell Weight: t-statistic=-1.299, p-value=0.195
Age=23.0, Variable=Volume: t-statistic=-0.325, p-value=0.746
Age=23.0, Variable=Density: t-statistic=-0.059, p-value=0.953

Age: 24.0
-------------------------
Age=24.0, Variable=Length: t-statistic=-0.737, p-value=0.463
Age=24.0, Variable=Diameter: t-statistic=-0.562, p-value=0.575
Age=24.0, Variable=Height: t-statistic=-1.306, p-value=0.195
Age=24.0, Variable=Weight: t-statistic=-0.194, p-value=0.847
Age=24.0, Variable=Shucked Weight: t-statistic=-0.230, p-value=0.818
Age=24.0, Variable=Viscera Weight: t-statistic=-1.029, p-value=0.306
Age=24.0, Variable=Shell Weight: t-statistic=-0.001, p-value=0.999
Age=24.0, Variable=Volume: t-statistic=-0.948, p-value=0.345
Age=24.0, Variable=Density: t-statistic=1.902, p-value=0.060

Age: 25.0
-------------------------
Age=25.0, Variable=Length: t-statistic=-0.621, p-value=0.538
Age=25.0, Variable=Diameter: t-statistic=-0.424, p-value=0.673
Age=25.0, Variable=Height: t-statistic=-1.119, p-value=0.269
Age=25.0, Variable=Weight: t-statistic=-0.623, p-value=0.537
Age=25.0, Variable=Shucked Weight: t-statistic=-1.083, p-value=0.284
Age=25.0, Variable=Viscera Weight: t-statistic=0.105, p-value=0.917
Age=25.0, Variable=Shell Weight: t-statistic=-0.198, p-value=0.844
Age=25.0, Variable=Volume: t-statistic=-0.676, p-value=0.503
Age=25.0, Variable=Density: t-statistic=-0.093, p-value=0.927

Age: 26.0
-------------------------
Age=26.0, Variable=Length: t-statistic=-2.342, p-value=0.024
Reject null
Age=26.0, Variable=Diameter: t-statistic=-2.610, p-value=0.013
Reject null
Age=26.0, Variable=Height: t-statistic=-1.976, p-value=0.055
Age=26.0, Variable=Weight: t-statistic=-2.815, p-value=0.008
Reject null
Age=26.0, Variable=Shucked Weight: t-statistic=-2.998, p-value=0.005
Reject null
Age=26.0, Variable=Viscera Weight: t-statistic=-4.099, p-value=0.000
Reject null
Age=26.0, Variable=Shell Weight: t-statistic=-2.197, p-value=0.034
Reject null
Age=26.0, Variable=Volume: t-statistic=-2.438, p-value=0.019
Reject null
Age=26.0, Variable=Density: t-statistic=-0.401, p-value=0.691

Age: 27.0
-------------------------
Age=27.0, Variable=Length: t-statistic=-0.426, p-value=0.671
Age=27.0, Variable=Diameter: t-statistic=-0.706, p-value=0.482
Age=27.0, Variable=Height: t-statistic=0.947, p-value=0.346
Age=27.0, Variable=Weight: t-statistic=-0.202, p-value=0.840
Age=27.0, Variable=Shucked Weight: t-statistic=-0.052, p-value=0.958
Age=27.0, Variable=Viscera Weight: t-statistic=-1.034, p-value=0.304
Age=27.0, Variable=Shell Weight: t-statistic=0.119, p-value=0.906
Age=27.0, Variable=Volume: t-statistic=-0.071, p-value=0.943
Age=27.0, Variable=Density: t-statistic=-0.035, p-value=0.972

Age: 29.0
-------------------------
Age=29.0, Variable=Length: t-statistic=0.132, p-value=0.895
Age=29.0, Variable=Diameter: t-statistic=-0.664, p-value=0.510
Age=29.0, Variable=Height: t-statistic=-0.499, p-value=0.620
Age=29.0, Variable=Weight: t-statistic=-0.533, p-value=0.596
Age=29.0, Variable=Shucked Weight: t-statistic=0.776, p-value=0.441
Age=29.0, Variable=Viscera Weight: t-statistic=-0.734, p-value=0.466
Age=29.0, Variable=Shell Weight: t-statistic=-0.719, p-value=0.475
Age=29.0, Variable=Volume: t-statistic=-0.665, p-value=0.509
Age=29.0, Variable=Density: t-statistic=0.240, p-value=0.811

Notes on t tests¶

  • It appears that crabs aged 5-11 have sigificant size differences between sexes.
  • There appears to be no difference past 20 years old.
  • When they are first hatched and when they get older, the diffrence becomes scientifically insignificant.

Correlation Matrix in Measurements¶

In [147]:
df_1 = df.drop('id', axis=1)
correlation_matrix = df_1.corr()
sns.heatmap(correlation_matrix, annot=True)
plt.title('Correlation Matrix')
plt.show()

Additional Data Analysis¶

  • Normalized Sample Across Ages
  • Split into Age Buckets Discovered in EDA
  • Review Sample
  • Test Male Female Significance in Each Age Group
  • Pairplot Exploration by Age Bucket

Normalized Sample Across Ages¶

Sampling Methodology:¶

Because we know that there is differences in sizes in some ages, when we anlayze, we want to be sure we are not over-representing an age in our data.

3 datasets¶

  • Master dataset with 10,000 records split between age groups.
  • 5-11 years. Equal representation of amoung included age groups.
  • 18+ years. Equal representation of amoung included age groups.
In [148]:
# Setup for sampling
df_filtered = df_1[(~df['Sex'].isin(['I']))]

ages = list(range(4,28))

sample_size = 10000

# Calculate the sample size for each age group and sex
sample_size_per_age_sex = sample_size // (len(ages) * 2)

df_sample = pd.DataFrame()
In [149]:
# Loop through each age group
for age in ages:
    for sex in ("M","F"):
        age_sex_group_sample = df_filtered[(df_filtered['Age'] == age) & (df_filtered['Sex'] == sex)].sample(n=sample_size_per_age_sex, replace=True)
        df_sample = pd.concat([df_sample, age_sex_group_sample])

# Reset the index
df_sample = df_sample.reset_index(drop=True)

Split into Age Buckets Discovered in EDA¶

In [150]:
df_age_5_11 = df_sample[df_sample['Age'].between(5, 11)]
# print(df_age_5_11.describe())
df_age_gt_18 = df_sample[df_sample['Age'] > 18]
# print(df_age_gt_18.describe())

Review Sample¶

In [177]:
fig, axes = plt.subplots(1, 2, figsize=(10, 4))  # Create subplots with 1 row and 2 columns
for i, d in enumerate([df_filtered, df_sample]):
    ax = axes[i]  # Get the corresponding axis for each subplot
    sns.kdeplot(data=d, x='Age', hue='Sex', ax=ax)
    ax.set_title('Age Distribution Density by Sex')
    ax.set_xlabel('Age')
    ax.set_ylabel('Frequency')
    sex_categories = d['Sex'].unique()
    legend_labels = [f'Sex: {sex}' for sex in sex_categories]
    ax.legend(legend_labels, title='Sex')
plt.tight_layout()  # Adjust spacing between subplots
plt.show()
C:\Users\darkc\anaconda3\lib\site-packages\seaborn\distributions.py:316: UserWarning: Dataset has 0 variance; skipping density estimate. Pass `warn_singular=False` to disable this warning.
  warnings.warn(msg, UserWarning)

Test Male Female Significance in Each Age Group¶

In [152]:
print("Age > 18")
print("-------------------------")
for var in variables:
    group1 = df_age_gt_18[df_age_gt_18['Sex'] == 'M'][var]
    group2 = df_age_gt_18[df_age_gt_18['Sex'] == 'F'][var]
    t_statistic, p_value = stats.ttest_ind(group1, group2)
    print(f"Variable: {var}")
    print(f"For Males and Females: t-statistic={t_statistic:.3f}, p-value={p_value:.3f}")
    if p_value < 0.05:
        print("Reject null")
    print("-------------------------")
Age > 18
-------------------------
Variable: Length
For Males and Females: t-statistic=-4.702, p-value=0.000
Reject null
-------------------------
Variable: Diameter
For Males and Females: t-statistic=-4.479, p-value=0.000
Reject null
-------------------------
Variable: Height
For Males and Females: t-statistic=-4.612, p-value=0.000
Reject null
-------------------------
Variable: Weight
For Males and Females: t-statistic=-4.100, p-value=0.000
Reject null
-------------------------
Variable: Shucked Weight
For Males and Females: t-statistic=-2.344, p-value=0.019
Reject null
-------------------------
Variable: Viscera Weight
For Males and Females: t-statistic=-5.526, p-value=0.000
Reject null
-------------------------
Variable: Shell Weight
For Males and Females: t-statistic=-4.056, p-value=0.000
Reject null
-------------------------
Variable: Volume
For Males and Females: t-statistic=-4.800, p-value=0.000
Reject null
-------------------------
Variable: Density
For Males and Females: t-statistic=1.877, p-value=0.061
-------------------------
In [153]:
print("Age 5-11")
print("-------------------------")
for var in variables:
    group1 = df_age_5_11[df_age_5_11['Sex'] == 'M'][var]
    group2 = df_age_5_11[df_age_5_11['Sex'] == 'F'][var]
    t_statistic, p_value = stats.ttest_ind(group1, group2)
    print(f"Variable: {var}")
    print(f"For Males and Females: t-statistic={t_statistic:.3f}, p-value={p_value:.3f}")
    if p_value < 0.05:
        print("Reject null")
    print("-------------------------")
Age 5-11
-------------------------
Variable: Length
For Males and Females: t-statistic=-4.601, p-value=0.000
Reject null
-------------------------
Variable: Diameter
For Males and Females: t-statistic=-4.556, p-value=0.000
Reject null
-------------------------
Variable: Height
For Males and Females: t-statistic=-4.492, p-value=0.000
Reject null
-------------------------
Variable: Weight
For Males and Females: t-statistic=-3.614, p-value=0.000
Reject null
-------------------------
Variable: Shucked Weight
For Males and Females: t-statistic=-3.447, p-value=0.001
Reject null
-------------------------
Variable: Viscera Weight
For Males and Females: t-statistic=-4.102, p-value=0.000
Reject null
-------------------------
Variable: Shell Weight
For Males and Females: t-statistic=-3.733, p-value=0.000
Reject null
-------------------------
Variable: Volume
For Males and Females: t-statistic=-3.817, p-value=0.000
Reject null
-------------------------
Variable: Density
For Males and Females: t-statistic=-0.407, p-value=0.684
-------------------------

Notes on t testing subsets

  • I tested my two subsets expecting one to be significant an the other not to. They are both significant. I assume this is because of un-equal samples across ages. this would nullify significance testing if age is not controlled for.

  • Re-did the analysis with after normalizing the sample across ages and found that density and shucked weight are the most similar in old the old crab group.

In [154]:
# Create PairGrid plot for dataset_5_11
g1 = sns.PairGrid(df_age_5_11, hue='Sex', palette=['blue', 'orange'], hue_order=['M', 'F'])
g1.map_upper(sns.scatterplot)
g1.map_lower(sns.scatterplot)
#g1.map_lower(sns.kdeplot)
g1.map_diag(sns.histplot)
g1.add_legend(title='Sex')

plt.tight_layout()
plt.show()
In [155]:
g2 = sns.PairGrid(df_age_gt_18, hue='Sex', palette=['blue', 'orange'], hue_order=['M', 'F'])
g2.map_upper(sns.scatterplot)
g2.map_lower(sns.scatterplot)
#g2.map_lower(sns.kdeplot)
g2.map_diag(sns.histplot)
g2.add_legend(title='Sex')

plt.tight_layout()
plt.show()

Notes on Pair Plot¶

  • The differences between sexes is not apparent visually here in the scatter plot.
  • There are some interesting asymtote, liniar and curved relationships. There appear to be finite sizes the crabs can reach.
In [156]:
# Create a box plot to compare the distributions of Age for male and female crabs
sns.boxplot(x='Sex', y='Volume', data=df_age_5_11)

# Set plot labels
plt.xlabel('Sex')
plt.ylabel('Volume')

# Set plot title
plt.title('Distribution of Volume by Sex')

# Display the plot
plt.show()
In [157]:
# Create a box plot to compare the distributions of Age for male and female crabs
sns.boxplot(x='Sex', y='Volume', data=df_age_gt_18)

# Set plot labels
plt.xlabel('Sex')
plt.ylabel('Volume')

# Set plot title
plt.title('Distribution of Volume by Sex')

# Display the plot
plt.show()

notes on additional plots¶

Hard to spot differences in the sexes. Will need to explore by age in order to really dig into it.

Model Crab Size Over Time¶

  • Line of Age x Average Dimension and Confidence Interval.
  • Regression of Age x Weight in each of the three age buckets
  • Smoothed Regression line
  • Analysis of Variance Between 3 Age Buckets

Line of Age x Average Dimension and Confidence Interval.¶

In [158]:
grouped_data = df_sample.groupby('Sex')
fig, axes = plt.subplots(nrows=len(variables), ncols=1, figsize=(8, 16))
plt.subplots_adjust(hspace=0.7)
for i, variable in enumerate(variables): # Variable Iteration Loop
    colors = ['pink', 'blue'] # Initialize colors for males and females
    for j, (sex, group) in enumerate(grouped_data):  # One line per sex
        # Calculate the mean and confidence interval
        mean = group.groupby('Age')[variable].mean()
        lower_ci = group.groupby('Age')[variable].quantile(0.25)
        upper_ci = group.groupby('Age')[variable].quantile(0.75)
        axes[i].plot(mean.index, mean, label=sex, color=colors[j]) # mean
        axes[i].fill_between(mean.index, lower_ci, upper_ci, alpha=0.3, color=colors[j]) # Confidence interval   
    axes[i].set_title(variable)  # Set the title and labels for each variable subplot
    axes[i].set_xlabel('Age')
    axes[i].set_ylabel(variable)
    axes[i].legend()  # legend
plt.tight_layout() # Adjust the figure layout and spacing
plt.show() # Display the plot

Regression of Age x Weight 5 - 11 years¶

In [178]:
males = df_age_5_11[df_age_5_11['Sex'] == 'M'].copy()
females = df_age_5_11[df_age_5_11['Sex'] == 'F'].copy()

males['Age_jittered'] = males['Age'] + np.random.uniform(-0.4, 0.4, len(males))
females['Age_jittered'] = females['Age'] + np.random.uniform(-0.4, 0.4, len(females))
In [179]:
X_males = males[['Age']]
y_males = males['Weight']
X_males = sm.add_constant(X_males)
model_males = sm.OLS(y_males, X_males).fit()  
X_females = females[['Age']]
y_females = females['Weight']
X_females = sm.add_constant(X_females) 
model_females = sm.OLS(y_females, X_females).fit()
In [180]:
# Model summaries
print("Male Regression Summary:")
print(model_males.summary())
print("\nFemale Regression Summary:")
print(model_females.summary())
Male Regression Summary:
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                 Weight   R-squared:                       0.675
Model:                            OLS   Adj. R-squared:                  0.675
Method:                 Least Squares   F-statistic:                     3021.
Date:                Wed, 28 Jun 2023   Prob (F-statistic):               0.00
Time:                        17:05:40   Log-Likelihood:                -4933.7
No. Observations:                1456   AIC:                             9871.
Df Residuals:                    1454   BIC:                             9882.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        -22.5299      0.775    -29.070      0.000     -24.050     -21.010
Age            5.1657      0.094     54.963      0.000       4.981       5.350
==============================================================================
Omnibus:                      119.933   Durbin-Watson:                   1.939
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              304.625
Skew:                           0.461   Prob(JB):                     7.10e-67
Kurtosis:                       5.042   Cond. No.                         34.5
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

Female Regression Summary:
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                 Weight   R-squared:                       0.660
Model:                            OLS   Adj. R-squared:                  0.660
Method:                 Least Squares   F-statistic:                     2820.
Date:                Wed, 28 Jun 2023   Prob (F-statistic):               0.00
Time:                        17:05:40   Log-Likelihood:                -4923.0
No. Observations:                1456   AIC:                             9850.
Df Residuals:                    1454   BIC:                             9861.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        -19.1803      0.769    -24.930      0.000     -20.690     -17.671
Age            4.9545      0.093     53.104      0.000       4.771       5.138
==============================================================================
Omnibus:                       73.362   Durbin-Watson:                   1.887
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              148.565
Skew:                           0.337   Prob(JB):                     5.49e-33
Kurtosis:                       4.412   Cond. No.                         34.5
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
In [181]:
# Scatter plot
plt.figure(figsize=(4.5, 3.6))
sns.scatterplot(data=males, x='Age_jittered', y='Weight', color='blue', alpha=0.7)
sns.scatterplot(data=females, x='Age_jittered', y='Weight', color='pink', alpha=0.7)
sns.lineplot(x=males['Age'], y=model_males.predict(X_males), color='blue', label=f'Males: {model_males.params[1]:.2f} * Age + {model_males.params[0]:.2f}')
sns.lineplot(x=females['Age'], y=model_females.predict(X_females), color='red', label=f'Females: {model_females.params[1]:.2f} * Age + {model_females.params[0]:.2f}')
plt.title('Regression Analysis (Age 12-17 vs. Weight)')
plt.xlabel('Age')
plt.ylabel('Weight')
plt.legend()
plt.show()

Regression of Age x Weight 12 - 18 years¶

In [182]:
df_age_12_18 = df_sample[(df_sample['Age'] >= 12) & (df_sample['Age'] <= 18)]

males = df_age_12_18[df_age_12_18['Sex'] == 'M'].copy()
females = df_age_12_18[df_age_12_18['Sex'] == 'F'].copy()

males['Age_jittered'] = males['Age'] + np.random.uniform(-0.4, 0.4, len(males))
females['Age_jittered'] = females['Age'] + np.random.uniform(-0.4, 0.4, len(females))
In [183]:
X_males = males[['Age']]
y_males = males['Weight']
X_males = sm.add_constant(X_males)
model_males = sm.OLS(y_males, X_males).fit()  
X_females = females[['Age']]
y_females = females['Weight']
X_females = sm.add_constant(X_females) 
model_females = sm.OLS(y_females, X_females).fit()
In [184]:
# Model summaries
print("Male Regression Summary:")
print(model_males.summary())

print("\nFemale Regression Summary:")
print(model_females.summary())
Male Regression Summary:
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                 Weight   R-squared:                       0.014
Model:                            OLS   Adj. R-squared:                  0.014
Method:                 Least Squares   F-statistic:                     21.26
Date:                Wed, 28 Jun 2023   Prob (F-statistic):           4.36e-06
Time:                        17:05:51   Log-Likelihood:                -5336.6
No. Observations:                1456   AIC:                         1.068e+04
Df Residuals:                    1454   BIC:                         1.069e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         23.6153      1.876     12.590      0.000      19.936      27.295
Age            0.5715      0.124      4.611      0.000       0.328       0.815
==============================================================================
Omnibus:                       40.345   Durbin-Watson:                   1.901
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               49.456
Skew:                           0.333   Prob(JB):                     1.82e-11
Kurtosis:                       3.609   Cond. No.                         115.
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

Female Regression Summary:
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                 Weight   R-squared:                       0.013
Model:                            OLS   Adj. R-squared:                  0.013
Method:                 Least Squares   F-statistic:                     19.58
Date:                Wed, 28 Jun 2023   Prob (F-statistic):           1.04e-05
Time:                        17:05:51   Log-Likelihood:                -5439.1
No. Observations:                1456   AIC:                         1.088e+04
Df Residuals:                    1454   BIC:                         1.089e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         23.7868      2.013     11.819      0.000      19.839      27.735
Age            0.5885      0.133      4.425      0.000       0.328       0.849
==============================================================================
Omnibus:                       60.449   Durbin-Watson:                   2.023
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               77.117
Skew:                           0.427   Prob(JB):                     1.80e-17
Kurtosis:                       3.736   Cond. No.                         115.
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
In [185]:
# Scatter plot
plt.figure(figsize=(4.5, 3.6))
sns.scatterplot(data=males, x='Age_jittered', y='Weight', color='blue', alpha=0.7)
sns.scatterplot(data=females, x='Age_jittered', y='Weight', color='pink', alpha=0.7) 
sns.lineplot(x=males['Age'], y=model_males.predict(X_males), color='blue', label=f'Males: {model_males.params[1]:.2f} * Age + {model_males.params[0]:.2f}')
sns.lineplot(x=females['Age'], y=model_females.predict(X_females), color='red', label=f'Females: {model_females.params[1]:.2f} * Age + {model_females.params[0]:.2f}')
plt.title('Regression Analysis (Age 12-18 vs. Weight)')
plt.xlabel('Age')
plt.ylabel('Weight')
plt.legend()
plt.show()

Regression of Age x Weight 18+ years¶

In [196]:
df_age_gt_18 = df_age_gt_18[(df_age_gt_18['Age'] != 25)]

males = df_age_gt_18[df_age_gt_18['Sex'] == 'M'].copy()
females = df_age_gt_18[df_age_gt_18['Sex'] == 'F'].copy()

males['Age_jittered'] = males['Age'] + np.random.uniform(-0.4, 0.4, len(males))
females['Age_jittered'] = females['Age'] + np.random.uniform(-0.4, 0.4, len(females))
In [197]:
# Regression analysis for males
X_males = males[['Age']]
y_males = males['Weight']
X_males = sm.add_constant(X_males)
model_males = sm.OLS(y_males, X_males).fit()  
X_females = females[['Age']]
y_females = females['Weight']
X_females = sm.add_constant(X_females) 
model_females = sm.OLS(y_females, X_females).fit()
In [198]:
print("Male Regression Summary:")
print(model_males.summary())
print("\nFemale Regression Summary:")
print(model_females.summary())
Male Regression Summary:
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                 Weight   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                 -0.001
Method:                 Least Squares   F-statistic:                   0.04924
Date:                Wed, 28 Jun 2023   Prob (F-statistic):              0.824
Time:                        17:09:44   Log-Likelihood:                -6082.9
No. Observations:                1664   AIC:                         1.217e+04
Df Residuals:                    1662   BIC:                         1.218e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         35.9300      1.997     17.994      0.000      32.014      39.847
Age           -0.0193      0.087     -0.222      0.824      -0.190       0.152
==============================================================================
Omnibus:                       29.696   Durbin-Watson:                   1.901
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               36.696
Skew:                           0.242   Prob(JB):                     1.08e-08
Kurtosis:                       3.543   Cond. No.                         200.
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

Female Regression Summary:
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                 Weight   R-squared:                       0.006
Model:                            OLS   Adj. R-squared:                  0.006
Method:                 Least Squares   F-statistic:                     10.66
Date:                Wed, 28 Jun 2023   Prob (F-statistic):            0.00112
Time:                        17:09:44   Log-Likelihood:                -6139.2
No. Observations:                1664   AIC:                         1.228e+04
Df Residuals:                    1662   BIC:                         1.229e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         30.0529      2.065     14.550      0.000      26.002      34.104
Age            0.2944      0.090      3.265      0.001       0.118       0.471
==============================================================================
Omnibus:                       66.022   Durbin-Watson:                   1.907
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               81.250
Skew:                           0.430   Prob(JB):                     2.27e-18
Kurtosis:                       3.658   Cond. No.                         200.
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
In [199]:
# Scatter plot
plt.figure(figsize=(4.5, 3.6))
sns.scatterplot(data=males, x='Age_jittered', y='Weight', color='blue', alpha=0.7)
sns.scatterplot(data=females, x='Age_jittered', y='Weight', color='pink', alpha=0.7)

# Regression lines and formulas for males and females
sns.lineplot(x=males['Age'], y=model_males.predict(X_males), color='blue', label=f'Males: {model_males.params[1]:.2f} * Age + {model_males.params[0]:.2f}')
sns.lineplot(x=females['Age'], y=model_females.predict(X_females), color='red', label=f'Females: {model_females.params[1]:.2f} * Age + {model_females.params[0]:.2f}')

plt.title('Regression Analysis (Age 19 + vs. Weight)')
plt.xlabel('Age')
plt.ylabel('Weight')
plt.legend()

plt.show()
  • Age 25 females are quite a bit above the males. I will re-try the old regresstion and omit them.

Smoothed Regression¶

In [201]:
age_range = np.arange(0, 29)

# Weight calculation based on age range
weights_male = np.zeros_like(age_range, dtype=float)
weights_female = np.zeros_like(age_range, dtype=float)

for i, age in enumerate(age_range):
    if age < 5:
        weights_male[i] = 0
        weights_female[i] = 0
    elif age >= 5 and age <= 11:
        weights_male[i] = 5.08 * age - 21.74
        weights_female[i] = 4.71 * age - 17.56
    elif age >= 12 and age <= 18:
        weights_male[i] = 0.57 * age + 23.6
        weights_female[i] = 0.59 * age + 23.7
    else:
        weights_male[i] = -0.02 * age + 35.9
        weights_female[i] = 0.29 * age + 30

# Moving average for smoothing
window_size = 5 
weights_male_smoothed = np.convolve(weights_male, np.ones(window_size)/window_size, mode='same')
weights_female_smoothed = np.convolve(weights_female, np.ones(window_size)/window_size, mode='same')

# Plot Graph
plt.figure(figsize=(8, 6))
plt.plot(age_range, weights_male_smoothed, label='Males')
plt.plot(age_range, weights_female_smoothed, label='Females')

plt.xlim(0, 25) 
plt.title('Weight Prediction by Age (Smoothed)')
plt.xlabel('Age')
plt.ylabel('Weight')
plt.legend()
plt.show()

image-2.png

Analysis of Variance Between 3 Age Buckets¶

In [202]:
from scipy import stats

# Extract the 'Weight' data for each dataset
weight_gt_18 = df_age_gt_18['Weight']
weight_12_18 = df_age_12_18['Weight']
weight_5_11 = df_age_5_11['Weight']

# Perform ANOVA test
fvalue, pvalue = stats.f_oneway(weight_gt_18, weight_12_18, weight_5_11)

# Print the results
print("ANOVA Results:")
print("F-value:", fvalue)
print("p-value:", pvalue)
ANOVA Results:
F-value: 2002.5639636706694
p-value: 0.0
In [203]:
# Combine the weight data from all three groups
weight_data = np.concatenate([weight_gt_18, weight_12_18, weight_5_11])

# orresponding group labels array
group_labels = ['18+', '12-18', '5-11']
group_labels = np.repeat(group_labels, [len(weight_gt_18), len(weight_12_18), len(weight_5_11)])

# Perform Tukey's HSD test
tukey_results = pairwise_tukeyhsd(weight_data, group_labels)
print(tukey_results)
 Multiple Comparison of Means - Tukey HSD, FWER=0.05 
=====================================================
group1 group2 meandiff p-adj  lower    upper   reject
-----------------------------------------------------
 12-18    18+   3.7194   0.0   3.0861   4.3527   True
 12-18   5-11 -12.7755   0.0 -13.4296 -12.1215   True
   18+   5-11 -16.4949   0.0 -17.1282 -15.8617   True
-----------------------------------------------------

Conclusion¶

  • Life stages appear produce far more significant size differences than sex.
  • We are able to approximate 4 separate life stages for these crabs
  1. Baby crabs: 0-4 years. Baby crabs appeared to be difficult to sample as is evident by few observations present.
  2. Young crabs: 5-11 years. A period of rapid growth
  3. Mature crabs: 12-18 years. Growth rate slows down
  4. Old crabs: 19+ years. Crabs can only grow so big.